import numpy as np
import pandas as pd
from pandas import DataFrame,Series
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import statsmodels.api as sm
from scipy import stats
import warnings
from sklearn.preprocessing import OneHotEncoder
warnings.filterwarnings('ignore')
#from sklearn.model_selection import train_test_split
def pretty_print_linear(coefs, names = None, sort = False):
    """Render a linear model as a readable weighted sum of feature names.

    Parameters
    ----------
    coefs : sequence of weights.
    names : optional feature labels; defaults to X0, X1, ...
    sort : when True, order terms by descending absolute weight.

    Returns
    -------
    str like "1.5 * a + -2.0 * b".
    """
    if names is None:
        names = ["X%s" % idx for idx in range(len(coefs))]
    terms = list(zip(coefs, names))
    if sort:
        # Largest-magnitude coefficients first.
        terms.sort(key=lambda term: -np.abs(term[0]))
    return " + ".join("%s * %s" % (weight, label) for weight, label in terms)
def scale_data(X):
    """Standardize features to zero mean and unit variance.

    Fits a fresh sklearn ``StandardScaler`` on ``X`` and returns the
    transformed (z-scored) array.
    """
    return StandardScaler().fit_transform(X)
def split_data(X,Y):
    """Split features/target into train and test partitions.

    Uses a fixed 67/33 split with ``random_state=42`` for reproducibility.
    Returns ``(X_train, X_test, Y_train, Y_test)``.
    """
    train_X, test_X, train_Y, test_Y = train_test_split(
        X, Y, test_size=0.33, random_state=42
    )
    return train_X, test_X, train_Y, test_Y
def root_mean_square_error(y_pred,y_test):
    """Root-mean-square error between predictions and ground truth.

    Fix: the original computed ``dot(abs(r), abs(r))`` — the ``abs`` is
    redundant when the residuals are multiplied by themselves. Use the
    direct mean-of-squares form instead.

    Parameters
    ----------
    y_pred, y_test : array-like of equal length (arrays or pandas Series).

    Returns
    -------
    float, sqrt(mean((y_pred - y_test)**2)).
    """
    residuals = np.asarray(y_pred) - np.asarray(y_test)
    # sqrt(dot(r, r) / n) == sqrt(mean(r**2))
    return np.sqrt(np.mean(residuals ** 2))
def plot_real_vs_predicted(y_pred,y_test):
    """Scatter predictions against actual values with a reference diagonal.

    Red dots are (predicted, real) pairs; the green segment is the
    identity line over [0, 50]. Shows the figure and returns the pyplot
    module so callers can tweak it further.
    """
    # Axis labels first; order relative to the plot calls is immaterial.
    plt.xlabel('predicted')
    plt.ylabel('real')
    plt.plot(y_pred, y_test, 'ro')
    plt.plot([0, 50], [0, 50], 'g-')
    plt.show()
    return plt
def generate_regression_values(model, X, y):
    """Print (and return) OLS inference statistics for a fitted linear model.

    Computes standard errors, t statistics and two-sided p-values for the
    intercept and coefficients of an sklearn-style fitted model.

    Fix: the original evaluated the t distribution with ``n - 1`` degrees
    of freedom; the correct residual degrees of freedom for OLS are
    ``n - k`` where ``k`` counts all estimated parameters (intercept
    included). Also now returns the summary DataFrame (callers previously
    received None, so this is backward compatible).

    Parameters
    ----------
    model : fitted regressor exposing ``intercept_``, ``coef_``, ``predict``.
    X : 2-D feature matrix (array-like or DataFrame).
    y : 1-D target vector.

    Returns
    -------
    pandas.DataFrame with one row per parameter, intercept first.
    """
    params = np.append(model.intercept_, model.coef_)
    predictions = model.predict(X)
    # Design matrix with an explicit intercept column prepended.
    newX = pd.DataFrame({"Constant": np.ones(len(X))}).join(pd.DataFrame(X))
    # Residual degrees of freedom: n observations minus k parameters.
    dof = len(newX) - len(newX.columns)
    # Unbiased estimate of the residual variance: SSE / (n - k).
    MSE = (sum((y - predictions) ** 2)) / dof
    # Diagonal of the parameter covariance matrix: sigma^2 * (X'X)^-1.
    var_b = MSE * (np.linalg.inv(np.dot(newX.T, newX)).diagonal())
    sd_b = np.sqrt(var_b)
    ts_b = params / sd_b
    # Two-sided p-values against Student's t with n - k df (was n - 1).
    p_values = [2 * (1 - stats.t.cdf(np.abs(t), dof)) for t in ts_b]
    result = pd.DataFrame()
    result["Coefficients"] = np.round(params, 4)
    result["Standard Errors"] = np.round(sd_b, 3)
    result["t values"] = np.round(ts_b, 3)
    # Column name kept as-is (sic) so existing readers of the printout match.
    result["Probabilites"] = np.round(p_values, 3)
    print(result)
    return result
# Load the synthetic health dataset; 'Unnamed: 0' is a stray index column
# left by a previous to_csv(index=True) export, so drop it immediately.
health_score = pd.read_csv('Synthetic_Health_Data.csv').drop(['Unnamed: 0'],axis=1)
# Preview the first five rows (notebook cell output follows).
health_score.head(5)
| Age | Height | Weight | BMI | Fat% | sleep_hours | Exercise_Time | gender | Glasses_Of_Water_PerWeek | Alcohol_Servings_PerWeek | ... | 30min_Cardio_PerWeek | Intensity_of_Cardio | Strength_Training_PerWeek | Smoke_PerDay | Chew_Tobacco | Healthy_WorkLife_Balance | Stressed | steps_PerMonth | sitting_hours | health_score | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 37 | 166 | 82.080267 | 28.256774 | 18.947780 | 5.296131 | 0.106583 | female | 0-5 glass | none | ... | More than 3 times a week | Light | Light | 15+ cigrattes | Yes | Sometimes | Sometimes | 87612 | 2 | 45.653149 |
| 1 | 26 | 167 | 63.702687 | 22.286450 | 1.602097 | 5.619734 | 1.057065 | male | 6-9 glass | none | ... | 1-2 times week | Very Hard | No | Never Smoked | No | No | No | 73339 | 10 | 49.455607 |
| 2 | 32 | 177 | 66.277449 | 21.593621 | 27.185321 | 5.521740 | 0.831857 | female | 0-5 glass | none | ... | 1-2 times week | Light | Hard | Never Smoked | Yes | Yes | No | 106329 | 11 | 54.382471 |
| 3 | 44 | 178 | 86.076920 | 27.062654 | 12.032952 | 8.714550 | 0.028077 | male | 0-5 glass | 4+ servings | ... | Never or Rarely | Hard | Very Hard | Quit smoking | No | Sometimes | Yes | 121816 | 4 | 55.461273 |
| 4 | 25 | 153 | 36.960478 | 16.427001 | 15.187378 | 6.295620 | 0.103930 | female | 6-9 glass | 4+ servings | ... | Never or Rarely | No | No | 15+ cigrattes | No | No | Yes | 128999 | 2 | 38.095238 |
5 rows × 26 columns
Exploratory Data Analysis
# Pairwise scatter/density plots of all numeric columns, colored by gender,
# to eyeball correlations and any separation between the two classes.
import matplotlib.pyplot as plt, seaborn as sns
import matplotlib.style as style
sns.pairplot(health_score, hue ='gender')
<seaborn.axisgrid.PairGrid at 0x246f88e5f90>
# Same pair plot, this time colored by smoking level to see whether
# smoking visibly stratifies any of the numeric features.
import matplotlib.pyplot as plt, seaborn as sns
import matplotlib.style as style
sns.pairplot(health_score, hue ='Smoke_PerDay')
<seaborn.axisgrid.PairGrid at 0x246fff56c50>
# Categorical survey columns to be integer-encoded below.
cat_cols = ['gender',
'Glasses_Of_Water_PerWeek',
'Alcohol_Servings_PerWeek',
'Three_or_More_Servings_of_whole_grain_perDay',
'Eat_Nuts_or_Fish_2_or_more_times_perWeek',
'Butter_or_Cheese_or_Cream_Milk_Or_Curd_2_or_more_times_perWeek',
'Sweet_Treats_Consumption_on_most_days_of_the_week',
'Fried_or_Junk_food_Consumption_on_most_days_of_the_week',
'Five_or_More_fruits_and_vegetables_perDay',
'30min_Cardio_PerWeek',
'Intensity_of_Cardio',
'Strength_Training_PerWeek',
'Smoke_PerDay',
'Chew_Tobacco',
'Healthy_WorkLife_Balance',
'Stressed']
# Label-encode each categorical column. Categories are taken in order of
# first appearance in the data, so the integer codes are arbitrary, NOT
# ordinal. NOTE(review): for ordered answers (e.g. Never/Sometimes/Yes)
# an explicit ordinal mapping or one-hot encoding may fit a linear model
# better — confirm with the modeling goals.
for col in cat_cols:
health_score[col] = pd.Categorical(health_score[col], categories=health_score[col].unique()).codes
health_score.head(5)
| Age | Height | Weight | BMI | Fat% | sleep_hours | Exercise_Time | gender | Glasses_Of_Water_PerWeek | Alcohol_Servings_PerWeek | ... | 30min_Cardio_PerWeek | Intensity_of_Cardio | Strength_Training_PerWeek | Smoke_PerDay | Chew_Tobacco | Healthy_WorkLife_Balance | Stressed | steps_PerMonth | sitting_hours | health_score | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 37 | 166 | 82.080267 | 28.256774 | 18.947780 | 5.296131 | 0.106583 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 87612 | 2 | 45.653149 |
| 1 | 26 | 167 | 63.702687 | 22.286450 | 1.602097 | 5.619734 | 1.057065 | 1 | 1 | 0 | ... | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 73339 | 10 | 49.455607 |
| 2 | 32 | 177 | 66.277449 | 21.593621 | 27.185321 | 5.521740 | 0.831857 | 0 | 0 | 0 | ... | 1 | 0 | 2 | 1 | 0 | 2 | 1 | 106329 | 11 | 54.382471 |
| 3 | 44 | 178 | 86.076920 | 27.062654 | 12.032952 | 8.714550 | 0.028077 | 1 | 0 | 1 | ... | 2 | 2 | 3 | 2 | 1 | 0 | 2 | 121816 | 4 | 55.461273 |
| 4 | 25 | 153 | 36.960478 | 16.427001 | 15.187378 | 6.295620 | 0.103930 | 0 | 1 | 1 | ... | 2 | 3 | 1 | 0 | 1 | 1 | 2 | 128999 | 2 | 38.095238 |
5 rows × 26 columns
Linear Regression
# Drop Age/Height/Weight/Fat%/gender before modeling — presumably because
# BMI already summarizes the anthropometrics (TODO confirm rationale).
# Note inplace=True mutates the shared health_score DataFrame.
health_score.drop(['Age', 'Height', 'Weight', 'Fat%','gender'],axis=1,inplace=True)
health_score.head(5)
| BMI | sleep_hours | Exercise_Time | Glasses_Of_Water_PerWeek | Alcohol_Servings_PerWeek | Three_or_More_Servings_of_whole_grain_perDay | Eat_Nuts_or_Fish_2_or_more_times_perWeek | Butter_or_Cheese_or_Cream_Milk_Or_Curd_2_or_more_times_perWeek | Sweet_Treats_Consumption_on_most_days_of_the_week | Fried_or_Junk_food_Consumption_on_most_days_of_the_week | ... | 30min_Cardio_PerWeek | Intensity_of_Cardio | Strength_Training_PerWeek | Smoke_PerDay | Chew_Tobacco | Healthy_WorkLife_Balance | Stressed | steps_PerMonth | sitting_hours | health_score | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 28.256774 | 5.296131 | 0.106583 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 87612 | 2 | 45.653149 |
| 1 | 22.286450 | 5.619734 | 1.057065 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | ... | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 73339 | 10 | 49.455607 |
| 2 | 21.593621 | 5.521740 | 0.831857 | 0 | 0 | 1 | 2 | 0 | 1 | 1 | ... | 1 | 0 | 2 | 1 | 0 | 2 | 1 | 106329 | 11 | 54.382471 |
| 3 | 27.062654 | 8.714550 | 0.028077 | 0 | 1 | 1 | 2 | 1 | 2 | 1 | ... | 2 | 2 | 3 | 2 | 1 | 0 | 2 | 121816 | 4 | 55.461273 |
| 4 | 16.427001 | 6.295620 | 0.103930 | 1 | 1 | 2 | 0 | 1 | 1 | 1 | ... | 2 | 3 | 1 | 0 | 1 | 1 | 2 | 128999 | 2 | 38.095238 |
5 rows × 21 columns
# Target is the continuous health score; features are every other column.
Y=health_score['health_score']
X=health_score.drop(['health_score'],axis=1)
# 75/25 shuffled split with a fixed seed for reproducibility.
X_train, X_test, Y_train, Y_test = train_test_split(X,Y ,random_state=104,test_size=0.25,shuffle=True)
#X_train= X_train.values.reshape(-1, 1)
#X_test = X_test.values.reshape(-1, 1)
# Sanity-check the partition shapes.
print(X_train.shape)
print(X_test.shape)
print(Y_train.shape)
print(Y_test.shape)
(2250, 20) (750, 20) (2250,) (750,)
from sklearn import metrics
# Create linear regression object
linreg = LinearRegression()
# Train the model using the training sets
linreg.fit(X_train,Y_train)
# Display the fitted equation with terms sorted by absolute weight.
# NOTE(review): features are unscaled, so coefficient magnitudes are not
# directly comparable as importances.
print ("Linear model: ", pretty_print_linear(linreg.coef_, X.columns, sort = True))
# Predict the values using the model
Y_lin_predict = linreg.predict(X_test)
#print(linreg.score(X_test, Y_test))
# Print the root mean square error
#print ("Root Mean Square Error: {}".format(root_mean_square_error(Y_lin_predict,Y_test)))
#plot_real_vs_predicted(Y_test,Y_lin_predict)
Linear model: -4.848617224854395 * 30min_Cardio_PerWeek + 4.7868889669633665 * Chew_Tobacco + 3.0946881325138045 * Exercise_Time + 2.5569400557061015 * Strength_Training_PerWeek + 2.1586465302020272 * Glasses_Of_Water_PerWeek + 1.406972030131514 * Eat_Nuts_or_Fish_2_or_more_times_perWeek + 1.3276233924417706 * Fried_or_Junk_food_Consumption_on_most_days_of_the_week + -1.2315789840104598 * Alcohol_Servings_PerWeek + 1.2063649658481126 * Sweet_Treats_Consumption_on_most_days_of_the_week + -1.1455053784707674 * Butter_or_Cheese_or_Cream_Milk_Or_Curd_2_or_more_times_perWeek + 1.1415350083619613 * Five_or_More_fruits_and_vegetables_perDay + 1.1323464648438006 * Stressed + -1.0249103074577206 * Intensity_of_Cardio + 1.0101659157820206 * Three_or_More_Servings_of_whole_grain_perDay + 0.925866882468655 * Healthy_WorkLife_Balance + 0.5907507838792576 * Smoke_PerDay + -0.25265598530568806 * BMI + -0.18660498640626255 * sleep_hours + -0.11123987871252386 * sitting_hours + 3.8804615679001735e-05 * steps_PerMonth
# Evaluate the linear model on the held-out test set.
from sklearn.metrics import mean_absolute_error,r2_score,mean_squared_error
print("R^2 : ", r2_score(Y_test, Y_lin_predict))
print("MAE :", mean_absolute_error(Y_test,Y_lin_predict))
print("RMSE:",np.sqrt(mean_squared_error(Y_test, Y_lin_predict)))
R^2 : 0.5023051414196489 MAE : 5.736300201227288 RMSE: 7.153884198323651
# Predicted-vs-actual scatter for the linear model; the red dashed line
# is the identity (a perfect model would put every point on it).
fig, ax = plt.subplots()
ax.scatter(Y_lin_predict, Y_test, edgecolors=(0, 0, 1))
ax.plot([Y_test.min(), Y_test.max()], [Y_test.min(), Y_test.max()], 'r--', lw=3)
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
plt.show()
Decision trees
# Fit a depth-10 decision tree regressor on the same split and render it.
from sklearn.tree import DecisionTreeRegressor
model = DecisionTreeRegressor(random_state=44,max_depth = 10)
model.fit(X_train, Y_train)
predictions = model.predict(X_test)
from sklearn.tree import plot_tree
plt.figure(figsize=(10,8), dpi=150)
plot_tree(model, feature_names=X.columns);
# Evaluate the depth-10 tree on the held-out test set.
from sklearn.metrics import mean_absolute_error,r2_score,mean_squared_error
print("R^2 : ", r2_score(Y_test, predictions))
print("MAE :", mean_absolute_error(Y_test,predictions))
print("RMSE:",np.sqrt(mean_squared_error(Y_test, predictions)))
R^2 : 0.2088128598057699 MAE : 7.272796033185009 RMSE: 9.019862944281558
# Predicted-vs-actual scatter for the depth-10 tree; red dashed line is
# the identity. Compare the spread here against the linear model's plot.
fig, ax = plt.subplots()
ax.scatter(predictions, Y_test, edgecolors=(0, 0, 1))
ax.plot([Y_test.min(), Y_test.max()], [Y_test.min(), Y_test.max()], 'r--', lw=3)
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
plt.show()
# Refit a shallower (depth-3) tree purely for a readable visualization.
# Note this rebinds `model` and `predictions` from the depth-10 run above.
from sklearn.tree import DecisionTreeRegressor
model = DecisionTreeRegressor(random_state=44,max_depth = 3)
model.fit(X_train, Y_train)
predictions = model.predict(X_test)
from sklearn.tree import plot_tree
plt.figure(figsize=(10,8), dpi=150)
plot_tree(model, feature_names=X.columns);